{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): absolute, machine-specific path; `folder` is not read by any later cell in this notebook.\n",
    "folder = \"/media/jie/新加卷/pku_data/中国工商注册企业信息(2023年9月更新）\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read only the first 10,000 rows of the Beijing (北京) CSV.\n",
    "df = pd.read_csv(\"../data/raw/北京.csv\", nrows=10_000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>企业名称</th>\n",
       "      <th>英文名称</th>\n",
       "      <th>统一社会信用代码</th>\n",
       "      <th>企业类型</th>\n",
       "      <th>经营状态</th>\n",
       "      <th>成立日期</th>\n",
       "      <th>核准日期</th>\n",
       "      <th>法定代表人</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>实缴资本</th>\n",
       "      <th>...</th>\n",
       "      <th>所属区县</th>\n",
       "      <th>网站链接</th>\n",
       "      <th>所属行业</th>\n",
       "      <th>一级行业分类</th>\n",
       "      <th>二级行业分类</th>\n",
       "      <th>三级行业分类</th>\n",
       "      <th>登记机关</th>\n",
       "      <th>经度</th>\n",
       "      <th>纬度</th>\n",
       "      <th>网址</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>北京高庆来蔬菜店</td>\n",
       "      <td>NaN</td>\n",
       "      <td>92110101MA00LT5D9E</td>\n",
       "      <td>个体工商户</td>\n",
       "      <td>注销</td>\n",
       "      <td>2005-06-06</td>\n",
       "      <td>2018-08-01</td>\n",
       "      <td>高庆来</td>\n",
       "      <td>1万人民币</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>东城区</td>\n",
       "      <td>NaN</td>\n",
       "      <td>零售业</td>\n",
       "      <td>农、林、牧、渔业</td>\n",
       "      <td>农业</td>\n",
       "      <td>谷物种植</td>\n",
       "      <td>北京市东城区市场监督管理局</td>\n",
       "      <td>116.439731</td>\n",
       "      <td>39.938575</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>北京徐忠伦蔬菜店</td>\n",
       "      <td>NaN</td>\n",
       "      <td>92110101MA04X3X25Q</td>\n",
       "      <td>个体工商户</td>\n",
       "      <td>注销</td>\n",
       "      <td>2011-07-07</td>\n",
       "      <td>2015-12-16</td>\n",
       "      <td>徐忠伦</td>\n",
       "      <td>0.3万人民币</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>东城区</td>\n",
       "      <td>NaN</td>\n",
       "      <td>零售业</td>\n",
       "      <td>农、林、牧、渔业</td>\n",
       "      <td>农业</td>\n",
       "      <td>谷物种植</td>\n",
       "      <td>北京市东城区市场监督管理局</td>\n",
       "      <td>116.439731</td>\n",
       "      <td>39.938575</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       企业名称 英文名称            统一社会信用代码   企业类型 经营状态        成立日期        核准日期  \\\n",
       "0  北京高庆来蔬菜店  NaN  92110101MA00LT5D9E  个体工商户   注销  2005-06-06  2018-08-01   \n",
       "1  北京徐忠伦蔬菜店  NaN  92110101MA04X3X25Q  个体工商户   注销  2011-07-07  2015-12-16   \n",
       "\n",
       "  法定代表人     注册资本 实缴资本  ...  所属区县 网站链接 所属行业    一级行业分类 二级行业分类 三级行业分类  \\\n",
       "0   高庆来    1万人民币  NaN  ...   东城区  NaN  零售业  农、林、牧、渔业     农业   谷物种植   \n",
       "1   徐忠伦  0.3万人民币  NaN  ...   东城区  NaN  零售业  农、林、牧、渔业     农业   谷物种植   \n",
       "\n",
       "            登记机关          经度         纬度  网址  \n",
       "0  北京市东城区市场监督管理局  116.439731  39.938575 NaN  \n",
       "1  北京市东城区市场监督管理局  116.439731  39.938575 NaN  \n",
       "\n",
       "[2 rows x 32 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['企业名称', '英文名称', '统一社会信用代码', '企业类型', '经营状态', '成立日期', '核准日期', '法定代表人',\n",
       "       '注册资本', '实缴资本', '参保人数', '公司规模', '经营范围', '注册地址', '营业期限', '纳税人识别号',\n",
       "       '工商注册号', '组织机构代码', '纳税人资质', '曾用名', '所属省份', '所属城市', '所属区县', '网站链接',\n",
       "       '所属行业', '一级行业分类', '二级行业分类', '三级行业分类', '登记机关', '经度', '纬度', '网址'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Registry fields used to subset the DataFrame.\n",
    "columns = (\n",
    "    # name, type, status (cancelled '注销' companies are removed later), business scope\n",
    "    [\"企业名称\", \"企业类型\", \"经营状态\", \"经营范围\"]\n",
    "    # industry label followed by its level-1 / level-2 / level-3 classification\n",
    "    + [\"所属行业\", \"一级行业分类\", \"二级行业分类\", \"三级行业分类\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>企业名称</th>\n",
       "      <th>企业类型</th>\n",
       "      <th>经营状态</th>\n",
       "      <th>经营范围</th>\n",
       "      <th>所属行业</th>\n",
       "      <th>一级行业分类</th>\n",
       "      <th>二级行业分类</th>\n",
       "      <th>三级行业分类</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>北京高庆来蔬菜店</td>\n",
       "      <td>个体工商户</td>\n",
       "      <td>注销</td>\n",
       "      <td>零售新鲜蔬菜。</td>\n",
       "      <td>零售业</td>\n",
       "      <td>农、林、牧、渔业</td>\n",
       "      <td>农业</td>\n",
       "      <td>谷物种植</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>北京徐忠伦蔬菜店</td>\n",
       "      <td>个体工商户</td>\n",
       "      <td>注销</td>\n",
       "      <td>零售新鲜蔬菜。</td>\n",
       "      <td>零售业</td>\n",
       "      <td>农、林、牧、渔业</td>\n",
       "      <td>农业</td>\n",
       "      <td>谷物种植</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       企业名称   企业类型 经营状态     经营范围 所属行业    一级行业分类 二级行业分类 三级行业分类\n",
       "0  北京高庆来蔬菜店  个体工商户   注销  零售新鲜蔬菜。  零售业  农、林、牧、渔业     农业   谷物种植\n",
       "1  北京徐忠伦蔬菜店  个体工商户   注销  零售新鲜蔬菜。  零售业  农、林、牧、渔业     农业   谷物种植"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['注销', '吊销', '吊销，未注销', '存续', '正常', nan, '开业', '吊销，已注销', '在业', '迁出',\n",
       "       '撤销', '证书废止'], dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['经营状态'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preliminary company screening\n",
    "def filter_company(df, keyword='氢', statuses=('正常', '开业', '存续')):\n",
    "    \"\"\"Return a boolean mask selecting companies that pass the preliminary screen.\n",
    "\n",
    "    A row is selected when its '经营范围' (business scope) text contains\n",
    "    `keyword` and its '经营状态' (operating status) is one of `statuses`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must have the columns '经营范围' and '经营状态'.\n",
    "    keyword : str, default '氢'\n",
    "        Literal substring to look for in '经营范围'.\n",
    "    statuses : iterable of str, default ('正常', '开业', '存续')\n",
    "        Values of '经营状态' that are kept.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Boolean mask aligned with `df.index`.\n",
    "    \"\"\"\n",
    "    # regex=False: the keyword is a plain substring, so characters such as '.'\n",
    "    # or '(' must not be interpreted as regex syntax. na=False makes rows with\n",
    "    # a missing business scope count as non-matching.\n",
    "    scope_match = df['经营范围'].str.contains(keyword, na=False, regex=False)\n",
    "    # NOTE(review): the data also contains the status '在业', which is not in\n",
    "    # the default list -- confirm whether it should be kept as well.\n",
    "    status_match = df['经营状态'].isin(list(statuses))\n",
    "    return scope_match & status_match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mask 1: '经营范围' contains the character \"氢\"; rows with a missing value count as no match.\n",
    "idx1 = df[\"经营范围\"].str.contains(\"氢\", na=False)\n",
    "# Mask 2: '经营状态' is one of 正常 / 开业 / 存续.\n",
    "idx2 = df[\"经营状态\"].isin([\"正常\", \"开业\", \"存续\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx1.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1230     有关充电电池、通讯器材、磷酸二氢铵及其他化工原料进出口的业务联络。（不得开展经营活动收取费用。）\n",
       "8109    销售氯化氢；服装加工；信息咨询（不含中介服务）；销售化工产品（不含化学危险品）、针纺织品、五...\n",
       "Name: 经营范围, dtype: object"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[idx1][\"经营范围\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 32)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8109    True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filter_company(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[filter_company(df)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 32)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>企业名称</th>\n",
       "      <th>英文名称</th>\n",
       "      <th>统一社会信用代码</th>\n",
       "      <th>企业类型</th>\n",
       "      <th>经营状态</th>\n",
       "      <th>成立日期</th>\n",
       "      <th>核准日期</th>\n",
       "      <th>法定代表人</th>\n",
       "      <th>注册资本</th>\n",
       "      <th>实缴资本</th>\n",
       "      <th>...</th>\n",
       "      <th>所属区县</th>\n",
       "      <th>网站链接</th>\n",
       "      <th>所属行业</th>\n",
       "      <th>一级行业分类</th>\n",
       "      <th>二级行业分类</th>\n",
       "      <th>三级行业分类</th>\n",
       "      <th>登记机关</th>\n",
       "      <th>经度</th>\n",
       "      <th>纬度</th>\n",
       "      <th>网址</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8109</th>\n",
       "      <td>北京和祥工贸公司</td>\n",
       "      <td>Beijing Hexiang Industry and Trade Company</td>\n",
       "      <td>91110101102351530Y</td>\n",
       "      <td>股份合作制</td>\n",
       "      <td>存续</td>\n",
       "      <td>1994-01-27</td>\n",
       "      <td>2016-03-03</td>\n",
       "      <td>郝东利</td>\n",
       "      <td>50万人民币</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>东城区</td>\n",
       "      <td>NaN</td>\n",
       "      <td>纺织服装、服饰业</td>\n",
       "      <td>制造业</td>\n",
       "      <td>纺织服装、服饰业</td>\n",
       "      <td>机织服装制造</td>\n",
       "      <td>北京市东城区市场监督管理局</td>\n",
       "      <td>116.442469</td>\n",
       "      <td>39.887331</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          企业名称                                        英文名称  \\\n",
       "8109  北京和祥工贸公司  Beijing Hexiang Industry and Trade Company   \n",
       "\n",
       "                统一社会信用代码   企业类型 经营状态        成立日期        核准日期 法定代表人    注册资本  \\\n",
       "8109  91110101102351530Y  股份合作制   存续  1994-01-27  2016-03-03   郝东利  50万人民币   \n",
       "\n",
       "     实缴资本  ...  所属区县 网站链接      所属行业 一级行业分类    二级行业分类  三级行业分类           登记机关  \\\n",
       "8109  NaN  ...   东城区  NaN  纺织服装、服饰业    制造业  纺织服装、服饰业  机织服装制造  北京市东城区市场监督管理局   \n",
       "\n",
       "              经度         纬度  网址  \n",
       "8109  116.442469  39.887331 NaN  \n",
       "\n",
       "[1 rows x 32 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
